Global time series analysis¶
Load libraries¶
import warnings
from functools import partial
import covid_analysis.utils.paths as path
import janitor
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_flavor as pf
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
from plotly.offline import init_notebook_mode
from pmdarima.arima import ADFTest, auto_arima
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import acf, adfuller, pacf
Set defaults for plots¶
# matplotlib
# NOTE(review): "seaborn-whitegrid" was removed in matplotlib >= 3.6
# (renamed "seaborn-v0_8-whitegrid") -- confirm the pinned matplotlib version.
plt.style.use("seaborn-whitegrid")
plt.rcParams["figure.figsize"] = (10, 8)  # default figure size (width, height) in inches
# seaborn
sns.set_style("whitegrid")
# plotly: enable inline notebook rendering and a white default template
init_notebook_mode()
pio.templates.default = "plotly_white"
# Route DataFrame.plot() through plotly instead of matplotlib
pd.options.plotting.backend = "plotly"
# Silence plot warnings from the libraries configured above
warnings.filterwarnings("ignore")
Utility functions¶
def series_corr_plot(series, plot_pacf=False, alpha=0.05, **kwargs):
    """Plot the (partial) autocorrelation of a time series with plotly.

    Parameters
    ----------
    series : pandas Series or DataFrame
        Time series to analyse; NaNs are dropped before computing.
    plot_pacf : bool
        When True plot the PACF, otherwise the ACF.
    alpha : float
        Significance level for the confidence interval.
    **kwargs
        Forwarded to statsmodels ``acf``/``pacf`` (e.g. ``nlags``, ``fft``).

    Returns
    -------
    plotly.graph_objects.Figure
    """
    clean = series.dropna()
    if plot_pacf:
        title = "Partial Autocorrelation (PACF)"
        values, conf_int = pacf(clean, alpha=alpha, **kwargs)
    else:
        title = "Autocorrelation (ACF)"
        values, conf_int = acf(clean, alpha=alpha, **kwargs)
    lag_axis = np.arange(len(values))
    # Center the confidence band on zero so it can be drawn around the stems
    lower_band = conf_int[:, 0] - values
    upper_band = conf_int[:, 1] - values

    figure = go.Figure()
    # Vertical stems from zero up to each correlation value
    for lag in lag_axis:
        figure.add_scatter(
            x=(lag, lag),
            y=(0, values[lag]),
            mode="lines",
            line_color="#3f3f3f",
            showlegend=False,
            hoverinfo='none',
        )
    # Markers at the tip of every stem
    figure.add_scatter(
        x=lag_axis,
        y=values,
        name="Correlation",
        mode="markers",
        marker_color="#1f77b4",
        marker_size=12,
        showlegend=False,
    )
    # Invisible upper trace; the lower trace fills up to it ("tonexty")
    figure.add_scatter(
        name="Upper Bound",
        x=lag_axis,
        y=upper_band,
        mode="lines",
        line_color="rgba(255,255,255,0)",
        showlegend=False,
    )
    figure.add_scatter(
        name="Lower Bound",
        x=lag_axis,
        y=lower_band,
        mode="lines",
        line_color="rgba(255,255,255,0)",
        fillcolor="rgba(32, 146, 230,0.3)",
        fill="tonexty",
        showlegend=False,
    )
    figure.update_yaxes(zerolinecolor='#000000')
    figure.update_layout(
        title=title,
        xaxis_title="Number of lags",
        hovermode="x unified",
    )
    return figure
def print_adfuller(adfuller_result):
    """Pretty-print the key fields of a statsmodels ``adfuller`` result tuple."""
    statistic = adfuller_result[0]
    p_value = adfuller_result[1]
    critical_values = adfuller_result[4]
    print(f"""
    ADF Statistic: {statistic}
    p-value: {p_value}
    Critical values {critical_values}
    """)
Define input and output directory¶
# Directories resolved by the project's path helpers
input_dir = path.data_processed_dir()  # processed CSV inputs
output_dir = path.models_dir()  # destination for serialized models
Load data¶
Confirmed and death time series¶
# Johns Hopkins cumulative confirmed/deaths counts per country and day
hopkins_tidy_cumulative_df = (
    pd.read_csv(
        filepath_or_buffer=input_dir.joinpath("hopkins_tidy_cumulative.csv")
    )
    # pyjanitor: parse the "date" column into datetime64
    .transform_column("date", pd.to_datetime)
)
hopkins_tidy_cumulative_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118755 entries, 0 to 118754
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 country 118755 non-null object
1 date 118755 non-null datetime64[ns]
2 confirmed 118755 non-null int64
3 deaths 118755 non-null int64
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 3.6+ MB
Vaccination¶
# Cumulative vaccination counts per country and day.
# NOTE(review): unlike the Hopkins frame, "date" is not parsed to datetime here.
vaccination_tidy_cumulative_df = pd.read_csv(
    filepath_or_buffer=input_dir.joinpath("vaccination_country_cumulative.csv")
)
vaccination_tidy_cumulative_df.head(1)
| country | date | doses_admin | people_partially_vaccinated | people_fully_vaccinated | |
|---|---|---|---|---|---|
| 0 | Afghanistan | 2021-02-22 | 0 | 0.0 | 0.0 |
Transform cumulative observations to daily changes in global level¶
# Collapse country-level cumulative counts into global daily increments:
# sum across countries per date, then first-difference the cumulative totals.
hopkins_tidy_global_daily_df = (
    hopkins_tidy_cumulative_df
    .drop(columns="country")
    .groupby("date")
    .sum()
    .diff()      # cumulative -> daily change
    .dropna()    # the first date has no previous day to diff against
    .reset_index()
)
hopkins_tidy_global_daily_df.head(1)
| date | confirmed | deaths | |
|---|---|---|---|
| 0 | 2020-01-23 | 98.0 | 1.0 |
Pandemic behavior to date¶
# Daily confirmed/deaths with 7-day rolling mean and std, one facet per variable
(
    hopkins_tidy_global_daily_df
    .set_index("date")
    .pipe(
        lambda frame: frame
        .join(frame.rolling("7D").mean(), rsuffix="_mean")
        .join(frame.rolling("7D").std(), rsuffix="_std")
    )
    .reset_index()
    .pivot_longer(index="date", names_to="metric")
    .assign(
        # Which original column the metric belongs to
        variable=lambda frame: frame.metric.str.extract(r"(confirmed|deaths)"),
        # Human-readable trace name (raw series -> "Daily")
        metric=lambda frame: (
            frame.metric.str.extract(r"(?:confirmed|deaths)_?(.+)")
            .fillna("Daily")
            .replace({
                "mean": "Rolling mean (7 days)",
                "std": "Rolling std (7 days)",
            })
        ),
    )
    .pipe(
        lambda frame: px.line(
            frame,
            x="date",
            y="value",
            color="metric",
            facet_row="variable",
            labels={
                "date": "Date",
                "value": "Observations number",
                "metric": "Trace",
                "variable": "Variable",
            },
        )
        .update_yaxes(matches=None)
        .update_layout(
            legend={"orientation": "h", "yanchor": "bottom", "y": 1.02},
            margin=dict(r=0.7, l=0),
            hovermode="x unified",
        )
    )
)
Fatality rate¶
Cumulative¶
# Cumulative global fatality rate: total deaths / total confirmed, in percent
(
    hopkins_tidy_cumulative_df
    .groupby("date")
    .sum()
    .assign(
        fatality_rate=lambda frame: (frame.deaths / frame.confirmed * 100).round(2)
    )
    .reset_index()
    .pipe(
        lambda frame: px.area(
            frame,
            x="date",
            y="fatality_rate",
            labels={"date": "Date", "fatality_rate": "Fatality rate"},
        )
        .update_layout(
            title="Cumulative",
            yaxis={"ticksuffix": "%"},
            margin=dict(r=0.7, l=0),
        )
    )
)
# Daily global fatality rate: same-day deaths / confirmed, in percent
(
    hopkins_tidy_global_daily_df
    .assign(
        fatality_rate=lambda frame: (frame.deaths / frame.confirmed * 100).round(2)
    )
    .reset_index()
    .pipe(
        lambda frame: px.area(
            frame,
            x="date",
            y="fatality_rate",
            labels={"date": "Date", "fatality_rate": "Fatality rate"},
        )
        .update_layout(
            title="Daily",
            yaxis={"ticksuffix": "%"},
            margin=dict(r=0.7, l=0),
        )
    )
)
Analysis of death time series¶
Subset deaths time series¶
# Keep only the death counts, indexed by date
global_daily_deaths = (
    hopkins_tidy_global_daily_df
    .loc[:, ["date", "deaths"]]
    .set_index("date")
)
# Quick look at the raw daily-deaths series (plotly backend)
(
    global_daily_deaths
    .plot()
    .update_layout(
        showlegend=False,
        xaxis_title="Date",
        yaxis_title="Number of deaths",
    )
)
Find critical points in the behavior of the time series¶
Daily autocorrelation¶
# ACF of the raw daily-deaths series (30 lags, direct computation, no FFT)
series_corr_plot(global_daily_deaths, nlags=30, fft=False)
Check for transformations and differentiation of data¶
# pmdarima ADF test: should_diff returns (p-value, needs-differencing flag)
adf_test = ADFTest(alpha=0.01)
# Raw series: non-stationary, differencing required...
print(adf_test.should_diff(global_daily_deaths))
# ...while a single first difference is already enough
print(adf_test.should_diff(global_daily_deaths.diff().dropna()))
(0.9040319035599506, True)
(0.01, False)
# Candidate transformations toward stationarity, compared via the ADF test
transform_serie_to_stationary = {
    "no_transformation": lambda series: series,
    "just_differencing": lambda series: series.diff().dropna(),
    "differencing_with_log": lambda series: series.apply(np.log).diff().dropna(),
    "differencing_with_sqrt": lambda series: series.apply(np.sqrt).diff().dropna(),
}
for name, transform in transform_serie_to_stationary.items():
    print(name)
    print_adfuller(adfuller(transform(global_daily_deaths)))
no_transformation
ADF Statistic: -2.5781238684028427
p-value: 0.09760343324866938
Critical values {'1%': -3.4415011513018263, '5%': -2.8664595311890215, '10%': -2.569389981494346}
just_differencing
ADF Statistic: -4.596225208142454
p-value: 0.00013125488311554312
Critical values {'1%': -3.4415393130846725, '5%': -2.866476335860869, '10%': -2.5693989358590006}
differencing_with_log
ADF Statistic: -3.054826473456103
p-value: 0.03008192358388
Critical values {'1%': -3.4415393130846725, '5%': -2.866476335860869, '10%': -2.5693989358590006}
differencing_with_sqrt
ADF Statistic: -4.053619224065926
p-value: 0.001154357544876195
Critical values {'1%': -3.4415393130846725, '5%': -2.866476335860869, '10%': -2.5693989358590006}
# First-order differenced series, used by the stationarity analysis below
global_differenced_deaths = global_daily_deaths.diff().dropna()
Show first-order differencing¶
# Rolling mean/std of the first-order differenced deaths, one facet per variable
(
    global_differenced_deaths
    .pipe(
        lambda frame: frame
        .join(frame.rolling("7D").mean(), rsuffix="_mean")
        .join(frame.rolling("7D").std(), rsuffix="_std")
    )
    .reset_index()
    .pivot_longer(index="date", names_to="metric")
    .assign(
        # Which original column the metric belongs to
        variable=lambda frame: frame.metric.str.extract(r"(confirmed|deaths)"),
        # Human-readable trace name (raw series -> "Daily")
        metric=lambda frame: (
            frame.metric.str.extract(r"(?:confirmed|deaths)_?(.+)")
            .fillna("Daily")
            .replace({
                "mean": "Rolling mean (7 days)",
                "std": "Rolling std (7 days)",
            })
        ),
    )
    .pipe(
        lambda frame: px.line(
            frame,
            x="date",
            y="value",
            color="metric",
            facet_row="variable",
            labels={
                "date": "Date",
                "value": "Observations number",
                "metric": "Trace",
                "variable": "Variable",
            },
        )
        .update_yaxes(matches=None)
        .update_layout(
            legend={"orientation": "h", "yanchor": "bottom", "y": 1.02},
            margin=dict(r=0.7, l=0),
            hovermode="x unified",
        )
    )
)
Clarify seasonal parameters signal¶
# ACF (FFT-based) and PACF of the differenced series, to read off the
# MA (q/Q) and AR (p/P) candidate orders
series_corr_plot(global_differenced_deaths, nlags=30, fft=True)
series_corr_plot(global_differenced_deaths, nlags=30, plot_pacf=True)
Show seasonal differencing¶
# Seasonally differenced series (on top of the first difference) with rolling stats.
# NOTE(review): the seasonal lag here is 8, while the rest of the analysis uses a
# 7-day (weekly) period -- confirm whether 8 is intentional.
(
    global_differenced_deaths
    .diff(8)
    .dropna()
    .pipe(
        lambda frame: frame
        .join(frame.rolling("7D").mean(), rsuffix="_mean")
        .join(frame.rolling("7D").std(), rsuffix="_std")
    )
    .reset_index()
    .pivot_longer(index="date", names_to="metric")
    .assign(
        # Which original column the metric belongs to
        variable=lambda frame: frame.metric.str.extract(r"(confirmed|deaths)"),
        # Human-readable trace name (raw series -> "Daily")
        metric=lambda frame: (
            frame.metric.str.extract(r"(?:confirmed|deaths)_?(.+)")
            .fillna("Daily")
            .replace({
                "mean": "Rolling mean (7 days)",
                "std": "Rolling std (7 days)",
            })
        ),
    )
    .pipe(
        lambda frame: px.line(
            frame,
            x="date",
            y="value",
            color="metric",
            facet_row="variable",
            labels={
                "date": "Date",
                "value": "Observations number",
                "metric": "Trace",
                "variable": "Variable",
            },
        )
        .update_yaxes(matches=None)
        .update_layout(
            legend={"orientation": "h", "yanchor": "bottom", "y": 1.02},
            margin=dict(r=0.7, l=0),
            hovermode="x unified",
        )
    )
)
Seasonal decomposition¶
# Additive seasonal decomposition with a weekly (7-day) period
decompose_res = seasonal_decompose(global_daily_deaths, period=7)
(
    pd.concat(
        [
            decompose_res.observed,
            decompose_res.trend,
            decompose_res.seasonal,
            decompose_res.resid,
        ],
        axis=1,
    )
    .rename_column(0, "observed")
    .reset_index()
    .pivot_longer(index="date")
    .pipe(
        lambda frame: px.line(
            frame,
            x="date",
            y="value",
            color="variable",
            facet_row="variable",
            labels={"date": "Date", "value": "Value"},
        )
        .update_yaxes(matches=None)
        .update_layout(showlegend=False, hovermode="x")
        # Keep only the component name (after "=") in each facet title
        .for_each_annotation(
            lambda annotation: annotation.update(
                text=annotation.text.split("=")[-1].capitalize()
            )
        )
    )
)
Forecasting¶
Utility functions¶
# Pre-configured auto_arima search shared by the forecasting runs below
custom_auto_arima = partial(
    auto_arima,
    start_p=0,
    start_q=0,
    max_p=10,
    max_q=10,
    d=1,  # Non-seasonal difference order
    seasonal=True,  # Is the time series seasonal?
    m=7,  # the seasonal period
    D=1,  # seasonal difference order
    start_P=0,
    start_Q=0,
    max_P=3,
    max_Q=3,
    stepwise=True,  # stepwise search instead of an exhaustive grid
    error_action="ignore",  # skip parameter combinations that fail to fit
    suppress_warnings=True,
    information_criterion="aic"  # used to select best model
)
def run_auto_arima(series: pd.Series, n_forecast: int = 21):
    """Fit a model with ``custom_auto_arima`` and forecast ``n_forecast`` days ahead.

    Parameters
    ----------
    series : pd.Series
        Date-indexed time series to model.
    n_forecast : int
        Number of days to forecast past the last observation.

    Returns
    -------
    tuple
        ``(model, in_sample_df, forecast_df)``: the fitted model, a frame with
        observed vs fitted values, and the out-of-sample forecast with its
        confidence bounds.
    """
    model = custom_auto_arima(series)
    # Calendar dates for the forecast horizon, starting the day after the data ends
    last_date = series.index[-1]
    forecast_dates = pd.date_range(
        start=last_date + pd.DateOffset(1),
        end=last_date + pd.DateOffset(n_forecast),
    )
    forecast_values, conf_interval = model.predict(n_forecast, return_conf_int=True)
    in_sample_df = pd.DataFrame(
        dict(
            original=series,
            predicted=model.predict_in_sample(),
        )
    ).reset_index()
    forecast_df = pd.DataFrame(
        dict(
            date=forecast_dates,
            forecast=forecast_values,
            lower_bound=conf_interval[:, 0],
            upper_bound=conf_interval[:, 1],
        )
    )
    return model, in_sample_df, forecast_df
Find an ARIMA model for the data¶
# Search for the best seasonal ARIMA on raw daily deaths; forecast 3 weeks ahead
arima_model, original_predicted_df, forecast_df = run_auto_arima(global_daily_deaths.deaths, n_forecast=21)
arima_model.summary()
| Dep. Variable: | y | No. Observations: | 608 |
|---|---|---|---|
| Model: | SARIMAX(0, 1, 1)x(0, 1, 1, 7) | Log Likelihood | -5049.392 |
| Date: | Wed, 22 Sep 2021 | AIC | 10104.785 |
| Time: | 02:47:00 | BIC | 10117.975 |
| Sample: | 0 | HQIC | 10109.920 |
| - 608 | |||
| Covariance Type: | opg |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| ma.L1 | -0.7071 | 0.018 | -40.038 | 0.000 | -0.742 | -0.672 |
| ma.S.L7 | -0.7175 | 0.022 | -32.030 | 0.000 | -0.761 | -0.674 |
| sigma2 | 1.189e+06 | 2.14e+04 | 55.665 | 0.000 | 1.15e+06 | 1.23e+06 |
| Ljung-Box (L1) (Q): | 1.11 | Jarque-Bera (JB): | 19880.27 |
|---|---|---|---|
| Prob(Q): | 0.29 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 5.83 | Skew: | 2.55 |
| Prob(H) (two-sided): | 0.00 | Kurtosis: | 30.74 |
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
Mean Absolute Error (MAE)¶
# Mean absolute error of the in-sample residuals
mae = np.abs(arima_model.resid()).mean()
mae
670.5356217262654
Model diagnostics¶
# Residual diagnostics: standardized residuals, histogram, Q-Q plot, correlogram
arima_model.plot_diagnostics();
Plot model results¶
# Observed vs fitted values plus the 21-day forecast with its confidence band
(
    original_predicted_df
    .set_index("date")
    .plot()
    .add_scatter(
        x=forecast_df.date,
        y=forecast_df.forecast,
        name="Forecast",
        mode="lines",
    )
    # Invisible upper bound; the lower trace fills up to it ("tonexty")
    .add_scatter(
        x=forecast_df.date,
        y=forecast_df.upper_bound,
        name="Upper Bound",
        mode="lines",
        line=dict(width=0),
        marker=dict(color="#444"),
        showlegend=False,
    )
    .add_scatter(
        x=forecast_df.date,
        y=forecast_df.lower_bound,
        name="Lower Bound",
        mode='lines',
        line=dict(width=0),
        marker=dict(color="#444"),
        fill='tonexty',
        fillcolor='rgba(68, 68, 68, 0.3)',
        showlegend=False,
    )
    .update_layout(
        hovermode="x unified",
        xaxis_title="Date",
        yaxis_title="Number of deaths",
        legend=dict(
            title="Variable",
            orientation="h",
            yanchor="bottom",
            y=1.02,
        ),
        margin={"r": 0.7, "l": 0},
    )
)
Save model¶
# Persist the fitted model for reuse outside the notebook
model_path = output_dir.joinpath("covid_death_time_series.pkl")
joblib.dump(arima_model, model_path);
Expected deaths statistics in the next three weeks¶
# Summary statistics of the forecasted daily deaths over the next 21 days
forecast_df.remove_columns("date").describe()
| forecast | lower_bound | upper_bound | |
|---|---|---|---|
| count | 21.000000 | 21.000000 | 21.000000 |
| mean | 7534.192405 | 4288.124286 | 10780.260524 |
| std | 1514.365608 | 1897.179938 | 1462.290332 |
| min | 4515.846646 | 260.190708 | 7680.382104 |
| 25% | 6650.907882 | 3087.947466 | 9993.530959 |
| 50% | 7847.368493 | 4684.894969 | 11118.055838 |
| 75% | 8745.263770 | 5905.214643 | 11789.652488 |
| max | 9562.897120 | 7336.141753 | 12777.849639 |
# Naive baselines to compare the forecast totals against:
# deaths actually observed over the last 21 days...
print(original_predicted_df.tail(21).original.sum())
# ...and the last 7 days extrapolated to three weeks
print(original_predicted_df.tail(7).original.sum() * 3)
187860.0
175392.0
# Totals over the 3-week horizon: point forecast plus confidence bounds
model_text_res = f"""Predicted number of deaths:
Actually predicted: {forecast_df.forecast.sum().round()}
Lowest: {forecast_df.lower_bound.sum().round()}
Highest: {forecast_df.upper_bound.sum().round()}
"""
print(model_text_res)
Predicted number of deaths:
Actually predicted: 158218.0
Lowest: 90051.0
Highest: 226385.0
Vaccination¶
Global vaccinations time series¶
# Aggregate vaccination counts across countries into a global daily series.
# NOTE(review): "date" here is still a string (never parsed with pd.to_datetime
# as the Hopkins frame was); plotly handles ISO dates, but confirm intent.
global_vaccination_cumulative_df = (
    vaccination_tidy_cumulative_df
    .groupby("date")
    .sum()
)
(
    global_vaccination_cumulative_df
    .reset_index()
    .pivot_longer(
        index="date"
    )
    .assign(
        # Human-readable trace names (fixes the "administraded" label typo)
        variable=lambda df: (
            df
            .variable
            .replace(
                dict(
                    doses_admin="Doses administered",
                    people_partially_vaccinated="People partially vaccinated",
                    people_fully_vaccinated="People fully vaccinated"
                )
            )
        )
    )
    .pipe(
        lambda df: (
            px.line(
                df,
                x="date",
                y="value",
                color="variable",
                labels=dict(
                    date="Date",
                    value="Value",
                    variable="Variable"
                )
            )
            .update_layout(
                hovermode="x unified",
                legend=dict(
                    orientation="h",
                    yanchor="bottom",
                    y=1.02
                )
            )
        )
    )
)